"""Regexes to look for bugs 🐛🐛."""
from contextlib import contextmanager
import dataclasses
from enum import Enum
import functools
from functools import lru_cache
from pathlib import Path
import re
import time
import typing
from urllib import parse

from cki_lib import misc
from cki_lib.logger import get_logger
from cki_lib.session import get_session
from cki_lib.timeout import func_timeout
from datawarehouse import objects
import prometheus_client as prometheus

from . import cache
from . import compiledregex
from . import dwobject
from . import settings

# Module-wide logger and a retrying HTTP session used to download log files.
LOGGER = get_logger(__name__)
SESSION = get_session(
    __name__,
    timeout=settings.DOWNLOAD_LOG_TIMEOUT,
    retry_args={"total": settings.DOWNLOAD_LOG_RETRIES},
)

# Prometheus metrics: overall time spent triaging one object (decorates
# triage()), and per-regex matching time labelled by the regex id.
METRIC_REGEX_SEARCH_TIME = prometheus.Histogram(
    'regex_search_time_seconds',
    'Time spent looking through a log file with the regexes'
)
METRIC_REGEX_MATCH_TIME = prometheus.Summary(
    'regex_match_time_seconds',
    'Time spent matching, grouped by regex.',
    ['regex_id']
)

# Status strings that mark a test (or an individual test result) as
# unsuccessful and therefore in need of triaging.
UNSUCCESSFUL_STATUSES = ['ERROR', 'FAIL']

# Outcome of triaging a whole KCIDB object.
TriageStatus = Enum('TriageStatus', ['INCOMPLETE', 'NOT_NEEDED', 'SUCCESS'])
# Outcome of matching one regex against one log file.
MatchStatus = Enum('MatchStatus', ['NOT_APPLICABLE', 'NO_MATCH', 'PARTIAL_MATCH', 'FULL_MATCH'])


@dataclasses.dataclass
class LogFile:
    """One log file belonging to a KCIDB object."""

    # Object the log belongs to (checkout/build/test/testresult).
    dw_obj: objects.RESTObject
    dw_test: objects.KCIDBTest | None = None  # only provided for TestResults
    # Base file name, e.g. 'build.log'; checked against file-name regexes.
    name: str = ''
    # Download location of the log; empty when the object has no URL for it.
    url: str = ''


@dataclasses.dataclass
class RegexMatch:
    """A regular expression match."""

    # How far the regex matched (NO_MATCH/PARTIAL_MATCH/FULL_MATCH).
    status: MatchStatus
    # The log file the regex was applied to.
    log_file: LogFile
    # The regex that produced this result.
    regex: compiledregex.CompiledIssueRegex


@dataclasses.dataclass
class TriageResult:
    """Result of triaging a KCIDB object."""

    # Whether triaging happened (SUCCESS) or was skipped (INCOMPLETE/NOT_NEEDED).
    status: TriageStatus
    # Applicable per-(log file, regex) results; empty unless status is SUCCESS.
    matches: list[RegexMatch]


def search_url(url: str, regex_id: str, text_match: re.Pattern[str]) -> bool:
    """Grep a remote file for a regex."""
    text = download(url)
    if text is None:
        # Nothing to search: the download failed or produced no content.
        LOGGER.debug(" file text wasn't retrieved therefore didn't match: %s",
                     text_match)
        return False

    # Time the actual regex search and abort it after the configured timeout.
    with _match_time_measure(regex_id):
        limit = settings.REGEX_TIMEOUT.total_seconds()
        if func_timeout(text_match.search, limit, args=[text]):
            return True
        LOGGER.debug(' file text (len=%d) did not match: %s',
                     len(text), text_match)
        return False


@METRIC_REGEX_SEARCH_TIME.time()
def triage(dw_obj: typing.Any, issueregex_ids: list[int]) -> TriageResult:
    """Use regexes to find failures.

    Returns a TriageResult whose status says whether triaging happened
    (INCOMPLETE / NOT_NEEDED / SUCCESS) and, on SUCCESS, the list of
    RegexMatch entries for every applicable (log file, regex) pair.
    """
    # incomplete object
    # Tests report a 'status' string, checkouts/builds a 'valid' boolean;
    # None means the object has not finished yet, so triaging must wait.
    if (dw_obj.status if dw_obj.type == 'test' else dw_obj.valid) is None:
        return TriageResult(TriageStatus.INCOMPLETE, [])

    # no triage needed
    if dw_obj.type in {'checkout', 'build'} and dw_obj.valid is True:
        return TriageResult(TriageStatus.NOT_NEEDED, [])
    # A test needs no triage when neither the test itself nor any of its
    # individual results is in an unsuccessful state.
    if dw_obj.type == 'test' and dw_obj.status not in UNSUCCESSFUL_STATUSES and not any(
            r.get('status') in UNSUCCESSFUL_STATUSES
            for r in (dw_obj.misc or {}).get('results', [])):
        return TriageResult(TriageStatus.NOT_NEEDED, [])

    # determine log files
    log_files = []
    if dw_obj.type == 'checkout':
        log_files += _log_files(dw_obj, 'merge.log')
    elif dw_obj.type == 'build':
        log_files += _log_files(dw_obj, 'build.log')
    elif dw_obj.type == "test":
        log_files += _log_files(dw_obj, "main-output.log")
        # Also triage every unsuccessful per-result entry: build a synthetic
        # "testresult" object for it, carrying over the test name and the
        # related build/checkout so the later matching steps can use them.
        log_files += misc.flattened([
            _log_files(
                dw_obj,
                "main-output.log",
                dw_obj_child=dwobject.from_attrs(
                    "testresult",
                    attrs={
                        **result,
                        "misc": {
                            **result.get("misc", {}),
                            "test_name": dw_obj.comment,
                            "related_build": dw_obj.misc.get("related_build"),
                            "related_checkout": dw_obj.misc.get("related_checkout"),
                        },
                    },
                ),
                dw_test=dw_obj,
            )
            for result in (dw_obj.misc or {}).get("results", [])
            if result.get("status") in UNSUCCESSFUL_STATUSES
        ])
    LOGGER.debug('There are %d log files to check', len(log_files))

    # no reason to look at regexes if there are no log files to triage
    if not log_files:
        return TriageResult(TriageStatus.SUCCESS, [])

    # really triage
    regexes = compiledregex.get_compiled_issueregexes(issueregex_ids)
    # Memoize per invocation: the same (url, regex) pair is searched only
    # once even when the same URL shows up in several log files.
    cached_search_url = functools.cache(search_url)
    return TriageResult(TriageStatus.SUCCESS, [
        RegexMatch(status, log_file, regex)
        for log_file in sorted(log_files, key=lambda log_file: log_file.url)
        for regex in regexes
        if (status := match(log_file, regex, cached_search_url)) != MatchStatus.NOT_APPLICABLE
    ])


def get_issueoccurrence_from_match(regex_match: RegexMatch) -> dict:
    """Build an issueoccurrence dictionary from *regex_match*.

    The returned dictionary is later appended to the issueoccurrences list
    in the kcidb_all file.  All of build_id, checkout_id, test_id and
    testresult_id are always present; only the one matching the type of the
    processed DW object is filled in (plus test_id for a KCIDBTestResult).
    Note: checkout_id is not set if related_checkout is not present, this
    is typical when not triaging from a kcidb_all file.
    """
    dw_obj = regex_match.log_file.dw_obj
    occurrence = {
        # Copy so later changes to the occurrence cannot leak into the regex.
        'issue': regex_match.regex.issue.copy(),
        # Required keys; exactly one of them receives a real ID below.
        'build_id': None,
        'checkout_id': None,
        'test_id': None,
        'testresult_id': None,
    }
    # Fill in only the ID slot that corresponds to the processed object.
    occurrence[f'{dw_obj.type}_id'] = dw_obj.get_id()
    if isinstance(dw_obj, objects.KCIDBTestResult):
        # A testresult additionally carries the ID of its parent test.
        occurrence['test_id'] = regex_match.log_file.dw_test.get_id()
    return occurrence


def _match_build(
    build: objects.KCIDBBuild,
    regex: compiledregex.CompiledIssueRegex,
) -> bool:
    """Return whether *build* satisfies the build-related fields of *regex*.

    Every configured pattern (architecture, kpet tree name, package name)
    must match; unset patterns are skipped.  A missing build or missing
    attribute fails any configured pattern.
    """
    if regex.architecture_match:
        architecture = build.architecture if build else None
        if not (architecture and regex.architecture_match.search(architecture)):
            LOGGER.debug(' architecture "%s" did not match: %s',
                         architecture, regex.architecture_match)
            return False

    if regex.kpet_tree_name_match:
        tree_name = build.misc.get('kpet_tree_name') if build else None
        if not (tree_name and regex.kpet_tree_name_match.search(tree_name)):
            LOGGER.debug(' kpet tree name "%s" did not match: %s',
                         tree_name, regex.kpet_tree_name_match)
            return False

    if regex.package_name_match:
        # The package name must match in full, not merely contain the pattern.
        package_name = build.misc.get('package_name') if build else None
        if not (package_name and regex.package_name_match.fullmatch(package_name)):
            LOGGER.debug(' package name "%s" did not match: %s',
                         package_name, regex.package_name_match)
            return False

    return True


def _match_checkout(
    checkout: objects.KCIDBCheckout,
    regex: compiledregex.CompiledIssueRegex,
) -> bool:
    """Return whether *checkout* satisfies the tree field of *regex*.

    An unset tree pattern always passes; a missing checkout or missing
    tree name fails a configured pattern.
    """
    if regex.tree_match:
        tree_name = checkout.tree_name if checkout else None
        if not (tree_name and regex.tree_match.search(tree_name)):
            LOGGER.debug(' tree name "%s" did not match: %s',
                         tree_name, regex.tree_match)
            return False
    return True


def match(
    log_file: LogFile,
    regex: compiledregex.CompiledIssueRegex,
    cached_search_url: typing.Callable[[str, str, re.Pattern[str]], bool] = search_url,
) -> MatchStatus:
    # pylint: disable=too-many-return-statements,too-many-branches,too-many-statements
    """Match regex against log.

    Cheap name-based checks run first and yield NOT_APPLICABLE when the
    regex does not target this log file at all; then the log text itself is
    searched (NO_MATCH on failure); finally the associated build/checkout
    properties are verified (PARTIAL_MATCH on failure).  FULL_MATCH means
    every configured criterion matched.
    """
    text_match = regex.text_match
    test_name_match = regex.test_name_match
    testresult_name_match = regex.testresult_name_match
    file_name_match = regex.file_name_match

    # The regex only applies to log files whose name matches.
    if file_name_match and not (log_file.name and file_name_match.search(log_file.name)):
        LOGGER.debug(' file name "%s" did not match: %s', log_file.name, file_name_match)
        return MatchStatus.NOT_APPLICABLE

    # A pattern of '.*' matches anything; only a non-trivial pattern
    # restricts the regex to testresult objects.
    if testresult_name_match and testresult_name_match.pattern != r'.*':
        if log_file.dw_obj.type == 'testresult':
            if not (log_file.dw_obj.comment and
                    testresult_name_match.search(log_file.dw_obj.comment)):
                LOGGER.debug(' testresult name "%s" did not match: %s',
                             log_file.dw_obj.comment, testresult_name_match)
                return MatchStatus.NOT_APPLICABLE
        else:
            LOGGER.debug(" obj.type=%r can't be matched if testresult_name_match != '.*' (%r)",
                         log_file.dw_obj.type, testresult_name_match)
            return MatchStatus.NOT_APPLICABLE

    # Same idea for the test-name pattern; where the name lives depends on
    # the object type (misc for testresults, comment for tests).
    if test_name_match and test_name_match.pattern != r".*":
        if log_file.dw_obj.type == 'testresult':
            test_name = log_file.dw_obj.misc.get("test_name")
        elif log_file.dw_obj.type == 'test':
            test_name = log_file.dw_obj.comment
        else:
            LOGGER.debug(' obj type %r did not match "test" nor "testresult"', log_file.dw_obj.type)
            return MatchStatus.NOT_APPLICABLE

        if not (test_name and test_name_match.search(test_name)):
            LOGGER.debug(' test name "%s" did not match: %s', test_name, test_name_match)
            return MatchStatus.NOT_APPLICABLE

    if text_match:
        if not log_file.url:
            LOGGER.error(" missing URL in file (%r) therefore didn't match: %s",
                         log_file, text_match)
            return MatchStatus.NO_MATCH

        # Download (memoized by the caller-supplied wrapper) and grep the log.
        if not cached_search_url(log_file.url, regex.id, text_match):
            return MatchStatus.NO_MATCH

    # We got this far, do the expensive queries.
    build = None
    checkout = None
    if log_file.dw_obj.type == "checkout":
        checkout = log_file.dw_obj
    elif log_file.dw_obj.type == "build":
        build = log_file.dw_obj
        checkout = cache.get_checkout(cache.get_cache_ttl(), build.checkout_id)
    elif log_file.dw_obj.type in ('test', 'testresult'):
        # Prefer the related objects embedded in misc (e.g. when triaging
        # from a kcidb_all file); otherwise fetch them through the cache.
        if log_file.dw_obj.misc.get('related_build'):
            build = log_file.dw_obj.misc.get('related_build')
        elif log_file.dw_obj.type == "testresult":
            assert log_file.dw_test
            build = cache.get_build(cache.get_cache_ttl(), log_file.dw_test.build_id)
        else:
            build = cache.get_build(cache.get_cache_ttl(), log_file.dw_obj.build_id)
        if log_file.dw_obj.misc.get('related_checkout'):
            checkout = log_file.dw_obj.misc.get('related_checkout')
        else:
            checkout = cache.get_checkout(cache.get_cache_ttl(), build.checkout_id)

    if not _match_build(build, regex):
        LOGGER.debug(' associated build properties did not match')
        return MatchStatus.PARTIAL_MATCH

    if not _match_checkout(checkout, regex):
        LOGGER.debug(' associated checkout properties did not match')
        return MatchStatus.PARTIAL_MATCH

    LOGGER.info('The obj=%r successfully matched the regex %r', log_file.dw_obj, regex)

    return MatchStatus.FULL_MATCH


@contextmanager
def _match_time_measure(
    regex_id: str,
) -> typing.Generator[None, None, None]:
    """Measure the time matching a regex.

    Observes the elapsed seconds in METRIC_REGEX_MATCH_TIME labelled with
    *regex_id*, and logs an error when matching exceeded
    settings.REGEX_EXPECTED_TIME.
    """
    # time.monotonic() cannot jump backwards (NTP, DST), unlike time.time(),
    # so the measured duration is always meaningful.
    start = time.monotonic()
    try:
        yield
    finally:
        elapsed = time.monotonic() - start
        METRIC_REGEX_MATCH_TIME.labels(regex_id=regex_id).observe(elapsed)
        if elapsed > settings.REGEX_EXPECTED_TIME.total_seconds():
            # %s, not %d: regex_id is a string (see search_url/metric label),
            # and %-formatting a str with %d raises inside the logging call.
            LOGGER.error("Regex matching took too long. regex_id=%s elapsed_s=%f",
                         regex_id, elapsed)


def _log_files(
    dw_obj: typing.Any,
    log_file_name: str,
    *,
    dw_test: objects.KCIDBTest | None = None,
    dw_obj_child: typing.Any = None,
) -> list[LogFile]:
    """Return all applicable log files for a KCIDB object.

    The returned LogFiles reference *dw_obj_child* when one is given,
    otherwise *dw_obj* itself.  Sources are the object's log_url (named
    *log_file_name*) plus the output_files of the object and of the child.
    """
    target = dw_obj_child or dw_obj
    files: list[LogFile] = []
    if url := getattr(dw_obj, 'log_url', None):
        files.append(LogFile(target, dw_test=dw_test, name=log_file_name, url=url))
    for attrs in getattr(dw_obj, 'output_files', None) or []:
        files.append(LogFile(target, dw_test=dw_test, **attrs))
    if dw_obj_child:
        for attrs in getattr(dw_obj_child, 'output_files', None) or []:
            files.append(LogFile(target, dw_test=dw_test, **attrs))
    return files


@lru_cache(maxsize=1)
def download(file_url: str) -> str | None:
    """Fetch the content of the given file, truncated to MAX_CONTENT_LENGTH.

    Supports file:// URLs (read straight from disk) and HTTP(S) URLs
    (streamed; only the first chunk up to MAX_CONTENT_LENGTH is consumed).
    Newlines are normalized to LF.  Returns None when the file could not be
    retrieved or was empty; any exception is logged and swallowed.
    """
    with misc.only_log_exceptions():
        parsed_url = parse.urlsplit(file_url)
        if parsed_url.scheme == "file":
            return Path(parsed_url.path).read_text(encoding="utf8", errors="backslashreplace")

        # Get just the first chunk up to MAX_CONTENT_LENGTH from the response
        # and normalize newlines.  The `with` closes the streamed response so
        # the pooled connection is released instead of leaking until GC.
        with SESSION.get(file_url, stream=True) as response:
            if not response.encoding:
                response.encoding = 'utf8'
            content_iterator = response.iter_content(
                settings.MAX_CONTENT_LENGTH, decode_unicode=True)
            if body := next(content_iterator, None):
                return body.replace("\r\n", "\n").replace("\r", "\n")
    return None
